\relax 
\providecommand\hyper@newdestlabel[2]{}
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
\global\let\oldcontentsline\contentsline
\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}}
\global\let\oldnewlabel\newlabel
\gdef\newlabel#1#2{\newlabelxx{#1}#2}
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
\AtEndDocument{\ifx\hyper@anchor\@undefined
\let\contentsline\oldcontentsline
\let\newlabel\oldnewlabel
\fi}
\fi}
\global\let\hyper@last\relax 
\gdef\HyperFirstAtBeginDocument#1{#1}
\providecommand\HyField@AuxAddToFields[1]{}
\providecommand\HyField@AuxAddToCoFields[2]{}
\citation{power2022grokking}
\citation{goodfellow2016deep}
\citation{paszke2019pytorch}
\citation{loshchilov2017adamw}
\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}\protected@file@percent }
\newlabel{sec:intro}{{1}{1}{Introduction}{section.1}{}}
\newlabel{sec:intro@cref}{{[section][1][]1}{[1][1][]1}}
\citation{Ko2022NotAL}
\citation{Bahamou2023LayerwiseAS}
\citation{Shea2024WhyLS}
\citation{Hu2021LoRALA}
\@writefile{toc}{\contentsline {section}{\numberline {2}Related Work}{2}{section.2}\protected@file@percent }
\newlabel{sec:related}{{2}{2}{Related Work}{section.2}{}}
\newlabel{sec:related@cref}{{[section][2][]2}{[1][2][]2}}
\@writefile{toc}{\contentsline {paragraph}{Layer-wise Learning Rate Adaptation:}{2}{section*.1}\protected@file@percent }
\@writefile{toc}{\contentsline {paragraph}{Optimization in Transformer Models:}{2}{section*.2}\protected@file@percent }
\citation{power2022grokking}
\citation{vaswani2017attention}
\citation{power2022grokking}
\citation{goodfellow2016deep}
\citation{kingma2014adam}
\@writefile{toc}{\contentsline {paragraph}{Grokking and Generalization:}{3}{section*.3}\protected@file@percent }
\@writefile{toc}{\contentsline {paragraph}{Algorithmic Learning Tasks:}{3}{section*.4}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {3}Background}{3}{section.3}\protected@file@percent }
\newlabel{sec:background}{{3}{3}{Background}{section.3}{}}
\newlabel{sec:background@cref}{{[section][3][]3}{[1][3][]3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Problem Setting}{3}{subsection.3.1}\protected@file@percent }
\citation{loshchilov2017adamw}
\@writefile{toc}{\contentsline {section}{\numberline {4}Method}{4}{section.4}\protected@file@percent }
\newlabel{sec:method}{{4}{4}{Method}{section.4}{}}
\newlabel{sec:method@cref}{{[section][4][]4}{[1][4][]4}}
\citation{vaswani2017attention}
\citation{paszke2019pytorch}
\citation{ba2016layer}
\citation{loshchilov2017adamw}
\@writefile{toc}{\contentsline {section}{\numberline {5}Experimental Setup}{5}{section.5}\protected@file@percent }
\newlabel{sec:experimental}{{5}{5}{Experimental Setup}{section.5}{}}
\newlabel{sec:experimental@cref}{{[section][5][]5}{[1][5][]5}}
\@writefile{toc}{\contentsline {paragraph}{Tasks and Datasets:}{5}{section*.5}\protected@file@percent }
\@writefile{toc}{\contentsline {paragraph}{Model Architecture:}{5}{section*.6}\protected@file@percent }
\@writefile{toc}{\contentsline {paragraph}{Training Configuration:}{5}{section*.7}\protected@file@percent }
\@writefile{toc}{\contentsline {paragraph}{Evaluation Metrics:}{6}{section*.8}\protected@file@percent }
\@writefile{toc}{\contentsline {paragraph}{Implementation Details:}{6}{section*.9}\protected@file@percent }
\@writefile{toc}{\contentsline {paragraph}{Baseline Comparison:}{6}{section*.10}\protected@file@percent }
\@writefile{toc}{\contentsline {paragraph}{Experimental Process:}{6}{section*.11}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {6}Results}{6}{section.6}\protected@file@percent }
\newlabel{sec:results}{{6}{6}{Results}{section.6}{}}
\newlabel{sec:results@cref}{{[section][6][]6}{[1][6][]6}}
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Summary of results comparing baseline uniform learning rate approach (Run 0) with our layer-wise learning rate strategy (Run 3) across all tasks. *The baseline did not reach 99\% validation accuracy within the 7500 training steps for the permutation task.\relax }}{6}{table.caption.12}\protected@file@percent }
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
\newlabel{tab:results_summary}{{1}{6}{Summary of results comparing baseline uniform learning rate approach (Run 0) with our layer-wise learning rate strategy (Run 3) across all tasks. *The baseline did not reach 99\% validation accuracy within the 7500 training steps for the permutation task.\relax }{table.caption.12}{}}
\newlabel{tab:results_summary@cref}{{[table][1][]1}{[1][6][]6}}
\newlabel{fig:val_acc_div}{{1a}{7}{Modular Division\relax }{figure.caption.13}{}}
\newlabel{fig:val_acc_div@cref}{{[subfigure][1][1]1a}{[1][7][]7}}
\newlabel{sub@fig:val_acc_div}{{a}{7}{Modular Division\relax }{figure.caption.13}{}}
\newlabel{sub@fig:val_acc_div@cref}{{[subfigure][1][1]1a}{[1][7][]7}}
\newlabel{fig:val_acc_sub}{{1b}{7}{Modular Subtraction\relax }{figure.caption.13}{}}
\newlabel{fig:val_acc_sub@cref}{{[subfigure][2][1]1b}{[1][7][]7}}
\newlabel{sub@fig:val_acc_sub}{{b}{7}{Modular Subtraction\relax }{figure.caption.13}{}}
\newlabel{sub@fig:val_acc_sub@cref}{{[subfigure][2][1]1b}{[1][7][]7}}
\newlabel{fig:val_acc_add}{{1c}{7}{Modular Addition\relax }{figure.caption.13}{}}
\newlabel{fig:val_acc_add@cref}{{[subfigure][3][1]1c}{[1][7][]7}}
\newlabel{sub@fig:val_acc_add}{{c}{7}{Modular Addition\relax }{figure.caption.13}{}}
\newlabel{sub@fig:val_acc_add@cref}{{[subfigure][3][1]1c}{[1][7][]7}}
\newlabel{fig:val_acc_perm}{{1d}{7}{Permutation\relax }{figure.caption.13}{}}
\newlabel{fig:val_acc_perm@cref}{{[subfigure][4][1]1d}{[1][7][]7}}
\newlabel{sub@fig:val_acc_perm}{{d}{7}{Permutation\relax }{figure.caption.13}{}}
\newlabel{sub@fig:val_acc_perm@cref}{{[subfigure][4][1]1d}{[1][7][]7}}
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Validation accuracy curves for all tasks, comparing baseline (Run 0) and layer-wise learning rate approaches (Run 3).\relax }}{7}{figure.caption.13}\protected@file@percent }
\newlabel{fig:all_tasks}{{1}{7}{Validation accuracy curves for all tasks, comparing baseline (Run 0) and layer-wise learning rate approaches (Run 3).\relax }{figure.caption.13}{}}
\newlabel{fig:all_tasks@cref}{{[figure][1][]1}{[1][7][]7}}
\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Ablation study results for the permutation task, comparing our full method against variants with partially uniform learning rates.\relax }}{7}{table.caption.14}\protected@file@percent }
\newlabel{tab:ablation}{{2}{7}{Ablation study results for the permutation task, comparing our full method against variants with partially uniform learning rates.\relax }{table.caption.14}{}}
\newlabel{tab:ablation@cref}{{[table][2][]2}{[1][7][]7}}
\@writefile{toc}{\contentsline {section}{\numberline {7}Conclusion}{8}{section.7}\protected@file@percent }
\newlabel{sec:conclusion}{{7}{8}{Conclusion}{section.7}{}}
\newlabel{sec:conclusion@cref}{{[section][7][]7}{[1][8][]8}}
\bibstyle{iclr2024_conference}
\bibdata{references}
\bibcite{ba2016layer}{{1}{2016}{{Ba et~al.}}{{Ba, Kiros, and Hinton}}}
\bibcite{Bahamou2023LayerwiseAS}{{2}{2023}{{Bahamou \& Goldfarb}}{{Bahamou and Goldfarb}}}
\bibcite{goodfellow2016deep}{{3}{2016}{{Goodfellow et~al.}}{{Goodfellow, Bengio, Courville, and Bengio}}}
\bibcite{Hu2021LoRALA}{{4}{2021}{{Hu et~al.}}{{Hu, Shen, Wallis, Allen-Zhu, Li, Wang, and Chen}}}
\bibcite{kingma2014adam}{{5}{2014}{{Kingma \& Ba}}{{Kingma and Ba}}}
\bibcite{Ko2022NotAL}{{6}{2022}{{Ko et~al.}}{{Ko, Lee, and Kim}}}
\bibcite{loshchilov2017adamw}{{7}{2017}{{Loshchilov \& Hutter}}{{Loshchilov and Hutter}}}
\bibcite{paszke2019pytorch}{{8}{2019}{{Paszke et~al.}}{{Paszke, Gross, Massa, Lerer, Bradbury, Chanan, Killeen, Lin, Gimelshein, Antiga, et~al.}}}
\bibcite{power2022grokking}{{9}{2022}{{Power et~al.}}{{Power, Burda, Edwards, Babuschkin, and Misra}}}
\bibcite{Shea2024WhyLS}{{10}{2024}{{Shea \& Schmidt}}{{Shea and Schmidt}}}
\bibcite{vaswani2017attention}{{11}{2017}{{Vaswani et~al.}}{{Vaswani, Shazeer, Parmar, Uszkoreit, Jones, Gomez, Kaiser, and Polosukhin}}}
\ttl@finishall
\gdef \@abspage@last{9}
